# Mount Google Drive so the dataset files under /content/MyDrive are readable.
from google.colab import drive
drive.mount("/content/MyDrive")
Mounted at /content/MyDrive
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import svm
from scipy.spatial.distance import cdist
from scipy.cluster.hierarchy import linkage,cophenet,dendrogram
from scipy.spatial.distance import pdist
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.decomposition import PCA
# Load the two halves of the car dataset: names (CSV) and numeric attributes (JSON).
df1=pd.read_csv('/content/MyDrive/MyDrive/Dataset/Car name.csv')
df2=pd.read_json('/content/MyDrive/MyDrive/Dataset/Car-Attributes.json')
df1.head() #Showing the first 5 datapoints
| car_name | |
|---|---|
| 0 | chevrolet chevelle malibu |
| 1 | buick skylark 320 |
| 2 | plymouth satellite |
| 3 | amc rebel sst |
| 4 | ford torino |
df1.shape  # (rows, columns) of the name table — one name column, 398 rows
(398, 1)
df2.head()  # first 5 rows of the attribute table
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
df2.shape  # same 398 rows as df1, so a positional join is valid
(398, 8)
# Join on the shared RangeIndex (both frames are 398 rows in the same order).
df=df1.join(df2,how='left') #Joining the two dataframes
df.head()
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
df.shape  # 9 columns after the join (1 name + 8 attributes)
(398, 9)
df.info() #info of the complete dataset — note 'hp' is dtype object, not numeric
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_name 398 non-null object 1 mpg 398 non-null float64 2 cyl 398 non-null int64 3 disp 398 non-null float64 4 hp 398 non-null object 5 wt 398 non-null int64 6 acc 398 non-null float64 7 yr 398 non-null int64 8 origin 398 non-null int64 dtypes: float64(3), int64(4), object(2) memory usage: 28.1+ KB
df.isnull().sum() #check for null values — all zero, but missing data may be encoded as "?"
car_name 0 mpg 0 cyl 0 disp 0 hp 0 wt 0 acc 0 yr 0 origin 0 dtype: int64
# Count literal "?" placeholders column by column; they mark missing values
# that isnull() cannot see.
for col in df.columns:
    n_marks = np.sum(df[col] == "?")
    print(col, ":", n_marks)
car_name : 0 mpg : 0 cyl : 0 disp : 0 hp : 6 wt : 0 acc : 0 yr : 0 origin : 0
# Work on a deep copy and turn the "?" placeholders into real NaNs.
# Fix: 'hp' was read as dtype object because of the "?" strings; cast it to
# numeric explicitly so the later median imputation and correlations operate
# on floats instead of relying on implicit coercion.
df3=df.copy(deep=True )
df3.replace( to_replace="?",value=np.nan,inplace=True)
df3['hp'] = pd.to_numeric(df3['hp'])  # object -> float64 now that "?" is NaN
df3.isnull().sum()
car_name 0 mpg 0 cyl 0 disp 0 hp 6 wt 0 acc 0 yr 0 origin 0 dtype: int64
# Drop the free-text name column; clustering uses only numeric features.
df3.drop(axis=1, columns='car_name', inplace=True)
df3.head()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 |
# Column-wise median imputation for the 6 NaNs introduced from "?".
# NOTE(review): if 'hp' is still dtype object here, x.median() depends on
# pandas coercing numeric strings — confirm, or cast 'hp' to numeric first.
median_filler= lambda x: x.fillna(x.median()) #imputing the missing values
df3=df3.apply(median_filler,axis=0)
df3.duplicated().sum()  # 0 — no duplicate rows
0
df3.describe() #statistical summary (count/mean/std/quartiles per feature)
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| count | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 |
| mean | 23.514573 | 5.454774 | 193.425879 | 104.304020 | 2970.424623 | 15.568090 | 76.010050 | 1.572864 |
| std | 7.815984 | 1.701004 | 104.269838 | 38.222625 | 846.841774 | 2.757689 | 3.697627 | 0.802055 |
| min | 9.000000 | 3.000000 | 68.000000 | 46.000000 | 1613.000000 | 8.000000 | 70.000000 | 1.000000 |
| 25% | 17.500000 | 4.000000 | 104.250000 | 76.000000 | 2223.750000 | 13.825000 | 73.000000 | 1.000000 |
| 50% | 23.000000 | 4.000000 | 148.500000 | 93.500000 | 2803.500000 | 15.500000 | 76.000000 | 1.000000 |
| 75% | 29.000000 | 8.000000 | 262.000000 | 125.000000 | 3608.000000 | 17.175000 | 79.000000 | 2.000000 |
| max | 46.600000 | 8.000000 | 455.000000 | 230.000000 | 5140.000000 | 24.800000 | 82.000000 | 3.000000 |
import warnings
warnings.filterwarnings('ignore')

# Distribution plots for mpg / cyl / disp.
# Fix: seaborn.distplot is deprecated and removed in seaborn >= 0.14;
# histplot(..., kde=True) is the supported equivalent. Also fixes the
# 'Milage' title typo.
plt.figure(figsize=(20, 6))
plt.subplot(1, 3, 1)
sns.histplot(df3['mpg'], color='green', kde=True)
plt.title('Mileage')
plt.subplot(1, 3, 2)
sns.histplot(df3['cyl'], color='blue', kde=True)
plt.title('Cylinder')
plt.subplot(1, 3, 3)
sns.histplot(df3['disp'], color='red', kde=True)
plt.title('Cylinder displacement')

# Box plots for hp / wt / acc to surface outliers.
plt.figure(figsize=(20, 6))
plt.subplot(1, 3, 1)
sns.boxplot(y=df3['hp'], color='yellow')
plt.title('Horsepower')
plt.subplot(1, 3, 2)
sns.boxplot(y=df3['wt'], color='orange')
plt.title('Weight')
plt.subplot(1, 3, 3)
sns.boxplot(y=df3['acc'], color='red')
plt.title('acc')
Text(0.5, 1.0, 'acc')
Mileage tends to follow a normal distribution whilst the other attributes do not. Horsepower and acceleration have a few outliers.
# Distribution plots for hp / wt / acc and box plots for mpg / cyl / disp.
# Fix: distplot -> histplot (deprecated/removed in seaborn >= 0.14) and the
# 'Horesepower' / 'Milage' title typos.
plt.figure(figsize=(20, 6))
plt.subplot(1, 3, 1)
sns.histplot(df3['hp'], color='green', kde=True)
plt.title('Horsepower')
plt.subplot(1, 3, 2)
sns.histplot(df3['wt'], color='blue', kde=True)
plt.title('Weight')
plt.subplot(1, 3, 3)
sns.histplot(df3['acc'], color='red', kde=True)
plt.title('acc')

plt.figure(figsize=(20, 6))
plt.subplot(1, 3, 1)
sns.boxplot(y=df3['mpg'], color='yellow')
plt.title('Mileage')
plt.subplot(1, 3, 2)
sns.boxplot(y=df3['cyl'], color='orange')
plt.title('Cylinder')
plt.subplot(1, 3, 3)
sns.boxplot(y=df3['disp'], color='red')
plt.title('Cylinder displacement')
Text(0.5, 1.0, 'Cylinder displacement')
Acceleration follows a normal distribution whereas the horsepower and weight variables are slightly skewed. There are not many outliers visible in the five-point summary.
# Class balance of cylinder counts, then a pairwise correlation heatmap.
sns.countplot(data=df3,x='cyl');
plt.figure(figsize = (15,7))
sns.heatmap(df3.corr(), cmap='plasma',annot=True, fmt='.2f');
Strong and weak correlations can be spotted among various attributes. Highest correlation can be seen between hp and cyl.
sns.scatterplot(data=df3,x='wt',y='disp',hue='cyl');  # weight vs displacement, coloured by cylinders
The more cylinders a car has, the higher its weight and cylinder displacement.
sns.scatterplot(data=df3,x='wt',y='mpg',hue='cyl');  # weight vs mileage, coloured by cylinders
As the number of cylinders increases, weight increases but mileage decreases.
# Assorted bivariate views and an all-pairs overview of df3.
sns.stripplot(data=df3,x='cyl',y='mpg',hue='origin');
sns.jointplot(data=df3,x='wt',y='hp',kind='hex');
sns.lmplot(data=df3,x='acc',y='mpg');  # linear fit of mpg on acceleration
sns.jointplot(data=df3,x='hp',y='acc',hue='cyl');
sns.pairplot(data=df3, diag_kind='kde');
df3.columns  # the 8 numeric features used for clustering
Index(['mpg', 'cyl', 'disp', 'hp', 'wt', 'acc', 'yr', 'origin'], dtype='object')
# Z-score every column (mean 0, std 1) so k-means distances are not dominated
# by large-scale features like weight.
df_scaled = df3.apply(zscore) #Standardizing the data
df_scaled.head()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | -0.706439 | 1.498191 | 1.090604 | 0.673118 | 0.630870 | -1.295498 | -1.627426 | -0.715145 |
| 1 | -1.090751 | 1.498191 | 1.503514 | 1.589958 | 0.854333 | -1.477038 | -1.627426 | -0.715145 |
| 2 | -0.706439 | 1.498191 | 1.196232 | 1.197027 | 0.550470 | -1.658577 | -1.627426 | -0.715145 |
| 3 | -0.962647 | 1.498191 | 1.061796 | 1.197027 | 0.546923 | -1.295498 | -1.627426 | -0.715145 |
| 4 | -0.834543 | 1.498191 | 1.042591 | 0.935072 | 0.565841 | -1.840117 | -1.627426 | -0.715145 |
from sklearn.metrics import silhouette_samples, silhouette_score

# Elbow method: fit k-means for k = 1..9 and record the mean distance from
# each point to its nearest centroid ("average distortion").
# Fixes: random_state makes the runs reproducible, n_init=10 pins the
# pre-sklearn-1.4 default explicitly, and the unused score/label/prediction
# accumulation (dead code) is removed.
clusters = range(1, 10)
meanDistortions = []
for k in clusters:
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(df_scaled)
    meanDistortions.append(
        sum(np.min(cdist(df_scaled, model.cluster_centers_, 'euclidean'), axis=1))
        / df_scaled.shape[0]
    )

plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
Text(0.5, 1.0, 'Selecting k with the Elbow Method')
# Silhouette score for k = 2..9 (silhouette is undefined for k = 1).
# random_state / n_init added so the reported scores are reproducible.
for i in range(2, 10):
    model = KMeans(n_clusters=i, n_init=10, random_state=42)
    model.fit(df_scaled)
    labels = model.labels_
    print("Score of ", i, "clusters", silhouette_score(df_scaled, labels)) #Finding the silhouette score
Score of 2 clusters 0.4148919831294719 Score of 3 clusters 0.3246673901601845 Score of 4 clusters 0.3123489662662654 Score of 5 clusters 0.33296973781495653 Score of 6 clusters 0.33652162036030914 Score of 7 clusters 0.2948026313873311 Score of 8 clusters 0.2891981619529679 Score of 9 clusters 0.2708075624042717
We can see an elbow around k=5 or k=6. Since the silhouette score is higher for k=6 and then decreases, the final model uses 6 clusters.
# Final k-means model with 6 clusters (chosen from the elbow + silhouette
# analysis above). random_state/n_init make the labels reproducible.
model = KMeans(n_clusters=6, n_init=10, random_state=42)
model.fit(df_scaled)
prediction = model.predict(df_scaled)

df4 = df3.copy(deep=True)
df1_scaled = df_scaled.copy(deep=True)
df4['GROUP'] = prediction        # cluster id per car, original units
df1_scaled["GROUP"] = prediction  # cluster id per car, z-scored units
df4.head()
| mpg | cyl | disp | hp | wt | acc | yr | origin | GROUP | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | 1 |
df1_scaled.head()  # scaled features with the GROUP label appended
| mpg | cyl | disp | hp | wt | acc | yr | origin | GROUP | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.706439 | 1.498191 | 1.090604 | 0.673118 | 0.630870 | -1.295498 | -1.627426 | -0.715145 | 1 |
| 1 | -1.090751 | 1.498191 | 1.503514 | 1.589958 | 0.854333 | -1.477038 | -1.627426 | -0.715145 | 1 |
| 2 | -0.706439 | 1.498191 | 1.196232 | 1.197027 | 0.550470 | -1.658577 | -1.627426 | -0.715145 | 1 |
| 3 | -0.962647 | 1.498191 | 1.061796 | 1.197027 | 0.546923 | -1.295498 | -1.627426 | -0.715145 | 1 |
| 4 | -0.834543 | 1.498191 | 1.042591 | 0.935072 | 0.565841 | -1.840117 | -1.627426 | -0.715145 | 1 |
# Profile the k-means clusters: mean of every original-unit feature per GROUP.
df5 = df4.groupby('GROUP')
df5.mean()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| GROUP | ||||||||
| 0 | 24.963077 | 4.138462 | 110.046154 | 92.015385 | 2398.907692 | 14.924615 | 74.538462 | 2.492308 |
| 1 | 14.429787 | 8.000000 | 350.042553 | 162.393617 | 4157.978723 | 12.576596 | 73.468085 | 1.000000 |
| 2 | 19.181707 | 6.207317 | 232.829268 | 101.579268 | 3301.402439 | 16.674390 | 75.743902 | 1.048780 |
| 3 | 29.159649 | 4.175439 | 132.719298 | 84.719298 | 2521.578947 | 15.870175 | 80.000000 | 1.000000 |
| 4 | 24.566667 | 4.000000 | 111.893939 | 74.863636 | 2325.030303 | 18.657576 | 72.787879 | 1.393939 |
| 5 | 34.837313 | 4.044776 | 98.164179 | 69.223881 | 2153.417910 | 17.256716 | 79.522388 | 2.701493 |
sns.pairplot(data=df4, diag_kind='kde',hue='GROUP');  # pairwise views coloured by cluster
# Hierarchical (agglomerative) clustering with average linkage, also 6 clusters.
# Fix: the 'affinity' parameter was renamed to 'metric' in scikit-learn 1.2
# and removed in 1.4; 'metric' is the supported spelling.
model = AgglomerativeClustering(n_clusters=6, metric='euclidean', linkage='average')
model.fit(df_scaled)
labels = model.labels_

df6 = df3.copy(deep=True)
df6['labels'] = labels  # hierarchical cluster id per row
df6.head(5)
| mpg | cyl | disp | hp | wt | acc | yr | origin | labels | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | 3 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | 3 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | 3 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | 3 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | 3 |
# Per-cluster feature means for the hierarchical solution.
# (Note: df6 is rebound from a DataFrame to its GroupBy here.)
df6 = df6.groupby('labels')
df6.mean()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| labels | ||||||||
| 0 | 26.517687 | 4.061224 | 118.173469 | 84.751701 | 2404.870748 | 15.999320 | 76.258503 | 1.619048 |
| 1 | 33.781818 | 4.363636 | 124.000000 | 65.636364 | 2828.636364 | 21.818182 | 79.636364 | 2.000000 |
| 2 | 19.208451 | 6.112676 | 232.957746 | 98.415493 | 3277.098592 | 16.878873 | 75.352113 | 1.000000 |
| 3 | 14.684000 | 7.980000 | 345.470000 | 160.400000 | 4121.560000 | 12.702000 | 73.740000 | 1.000000 |
| 4 | 21.444444 | 6.000000 | 159.333333 | 119.222222 | 3073.555556 | 14.155556 | 77.777778 | 2.666667 |
| 5 | 34.398333 | 4.000000 | 95.450000 | 70.533333 | 2085.100000 | 16.803333 | 79.033333 | 2.850000 |
# Cophenetic correlation: how faithfully the dendrogram preserves pairwise
# distances. Higher is better; compared across linkage methods below.
Z_average = linkage(df_scaled, metric='euclidean', method='average') #Finding the cophenet correlation coefficient
c, coph_dists = cophenet(Z_average , pdist(df_scaled))
print(c)
0.7554096381782244
# Same check with complete linkage.
# NOTE(review): the variable is still named Z_average although this is the
# complete-linkage tree — misleading but harmless (not read later).
Z_average = linkage(df_scaled, metric='euclidean', method='complete')
c, coph_dists = cophenet(Z_average , pdist(df_scaled))
print(c)
0.723784736927783
# Same check with Ward linkage (again reusing the Z_average name).
Z_average = linkage(df_scaled, metric='euclidean', method='ward')
c, coph_dists = cophenet(Z_average , pdist(df_scaled))
print(c)
0.681505889742128
# Part 2: vehicle-silhouette dataset for SVM classification with PCA.
dfa=pd.read_csv('/content/MyDrive/MyDrive/Dataset/vehicle.csv') #Reading the datset
dfa.head() #First 5 datapoints
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
dfa.shape #no of rows and columns in the dataset (846 x 19)
(846, 19)
dfa.info()  # several columns carry a handful of nulls; 'class' is the object target
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 841 non-null float64 2 distance_circularity 842 non-null float64 3 radius_ratio 840 non-null float64 4 pr.axis_aspect_ratio 844 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 845 non-null float64 7 elongatedness 845 non-null float64 8 pr.axis_rectangularity 843 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 843 non-null float64 11 scaled_variance.1 844 non-null float64 12 scaled_radius_of_gyration 844 non-null float64 13 scaled_radius_of_gyration.1 842 non-null float64 14 skewness_about 840 non-null float64 15 skewness_about.1 845 non-null float64 16 skewness_about.2 845 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
dfa.isnull().sum() #Check for null values (up to 6 per column)
compactness 0 circularity 5 distance_circularity 4 radius_ratio 6 pr.axis_aspect_ratio 2 max.length_aspect_ratio 0 scatter_ratio 1 elongatedness 1 pr.axis_rectangularity 3 max.length_rectangularity 0 scaled_variance 3 scaled_variance.1 2 scaled_radius_of_gyration 2 scaled_radius_of_gyration.1 4 skewness_about 6 skewness_about.1 1 skewness_about.2 1 hollows_ratio 0 class 0 dtype: int64
dfa['class'].unique()  # three target labels: van, car, bus
array(['van', 'car', 'bus'], dtype=object)
# Encode the target numerically (van=1, car=2, bus=3), fill the remaining
# NaNs with each column's median, and show the transposed summary statistics.
dfa['class']=dfa['class'].replace({'van':1,'car':2,'bus':3})
dfa = dfa.fillna(dfa.median())
dfa.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| compactness | 846.0 | 93.678487 | 8.234474 | 73.0 | 87.00 | 93.0 | 100.00 | 119.0 |
| circularity | 846.0 | 44.823877 | 6.134272 | 33.0 | 40.00 | 44.0 | 49.00 | 59.0 |
| distance_circularity | 846.0 | 82.100473 | 15.741569 | 40.0 | 70.00 | 80.0 | 98.00 | 112.0 |
| radius_ratio | 846.0 | 168.874704 | 33.401356 | 104.0 | 141.00 | 167.0 | 195.00 | 333.0 |
| pr.axis_aspect_ratio | 846.0 | 61.677305 | 7.882188 | 47.0 | 57.00 | 61.0 | 65.00 | 138.0 |
| max.length_aspect_ratio | 846.0 | 8.567376 | 4.601217 | 2.0 | 7.00 | 8.0 | 10.00 | 55.0 |
| scatter_ratio | 846.0 | 168.887707 | 33.197710 | 112.0 | 147.00 | 157.0 | 198.00 | 265.0 |
| elongatedness | 846.0 | 40.936170 | 7.811882 | 26.0 | 33.00 | 43.0 | 46.00 | 61.0 |
| pr.axis_rectangularity | 846.0 | 20.580378 | 2.588558 | 17.0 | 19.00 | 20.0 | 23.00 | 29.0 |
| max.length_rectangularity | 846.0 | 147.998818 | 14.515652 | 118.0 | 137.00 | 146.0 | 159.00 | 188.0 |
| scaled_variance | 846.0 | 188.596927 | 31.360427 | 130.0 | 167.00 | 179.0 | 217.00 | 320.0 |
| scaled_variance.1 | 846.0 | 439.314421 | 176.496341 | 184.0 | 318.25 | 363.5 | 586.75 | 1018.0 |
| scaled_radius_of_gyration | 846.0 | 174.706856 | 32.546277 | 109.0 | 149.00 | 173.5 | 198.00 | 268.0 |
| scaled_radius_of_gyration.1 | 846.0 | 72.443262 | 7.468734 | 59.0 | 67.00 | 71.5 | 75.00 | 135.0 |
| skewness_about | 846.0 | 6.361702 | 4.903244 | 0.0 | 2.00 | 6.0 | 9.00 | 22.0 |
| skewness_about.1 | 846.0 | 12.600473 | 8.930962 | 0.0 | 5.00 | 11.0 | 19.00 | 41.0 |
| skewness_about.2 | 846.0 | 188.918440 | 6.152247 | 176.0 | 184.00 | 188.0 | 193.00 | 206.0 |
| hollows_ratio | 846.0 | 195.632388 | 7.438797 | 181.0 | 190.25 | 197.0 | 201.00 | 211.0 |
| class | 846.0 | 2.022459 | 0.702130 | 1.0 | 2.00 | 2.0 | 3.00 | 3.0 |
# Target balance, correlation heatmap, and all-pairs overview of dfa.
sns.countplot(data=dfa,x='class');
plt.figure(figsize = (15,7))
sns.heatmap(dfa.corr(), cmap='plasma',annot=True, fmt='.2f');
sns.pairplot(dfa, diag_kind='kde');
# Distributions of compactness / circularity / distance_circularity and box
# plots of radius_ratio / scatter_ratio / elongatedness.
# Fix: distplot -> histplot (deprecated and removed in seaborn >= 0.14).
plt.figure(figsize=(20, 6))
plt.subplot(1, 3, 1)
sns.histplot(dfa['compactness'], color='green', kde=True)
plt.title('compactness')
plt.subplot(1, 3, 2)
sns.histplot(dfa['circularity'], color='blue', kde=True)
plt.title('circularity')
plt.subplot(1, 3, 3)
sns.histplot(dfa['distance_circularity'], color='red', kde=True)
plt.title('distance_circularity')

plt.figure(figsize=(20, 6))
plt.subplot(1, 3, 1)
sns.boxplot(y=dfa['radius_ratio'], color='yellow')
plt.title('radius_ratio')
plt.subplot(1, 3, 2)
sns.boxplot(y=dfa['scatter_ratio'], color='orange')
plt.title('scatter_ratio')
plt.subplot(1, 3, 3)
sns.boxplot(y=dfa['elongatedness'], color='red')
plt.title('elongatedness')
Text(0.5, 1.0, 'elongatedness')
The plots show the distributions of these features of the dataset.
# Mirror view: distributions of radius_ratio / scatter_ratio / elongatedness
# and box plots of compactness / circularity / distance_circularity.
# Fix: distplot -> histplot (deprecated and removed in seaborn >= 0.14).
plt.figure(figsize=(20, 6))
plt.subplot(1, 3, 1)
sns.histplot(dfa['radius_ratio'], color='green', kde=True)
plt.title('radius_ratio')
plt.subplot(1, 3, 2)
sns.histplot(dfa['scatter_ratio'], color='blue', kde=True)
plt.title('scatter_ratio')
plt.subplot(1, 3, 3)
sns.histplot(dfa['elongatedness'], color='red', kde=True)
plt.title('elongatedness')

plt.figure(figsize=(20, 6))
plt.subplot(1, 3, 1)
sns.boxplot(y=dfa['compactness'], color='yellow')
plt.title('compactness')
plt.subplot(1, 3, 2)
sns.boxplot(y=dfa['circularity'], color='orange')
plt.title('circularity')
plt.subplot(1, 3, 3)
sns.boxplot(y=dfa['distance_circularity'], color='red')
plt.title('distance_circularity')
Text(0.5, 1.0, 'distance_circularity')
The plots show the distributions of the remaining features of the dataset.
# Bivariate views coloured by class, plus the class share as a pie chart.
sns.scatterplot(data=dfa,x='compactness',y='scatter_ratio',hue=dfa['class']);
sns.jointplot(data=dfa,x='radius_ratio',y='scaled_variance',hue=dfa['class']);
dfa.groupby('class').size().plot(kind='pie',autopct='%1.0f%%',labels=['Van','Car','Bus']);
plt.title('Class')
Text(0.5, 1.0, 'Class')
dfa.columns  # 18 silhouette features + the encoded 'class' target
Index(['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
'skewness_about.2', 'hollows_ratio', 'class'],
dtype='object')
# Split features/target, z-score the features, then hold out 30% for testing.
y = dfa['class']
x = dfa.drop('class', axis=1).apply(zscore)  # standardize every feature
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=1)
for part_name, part in (('X Train', x_train), ('X Test', x_test),
                        ('Y Train', y_train), ('Y Test', y_test)):
    print('{} set contains {} data'.format(part_name, part.shape))
X Train set contains (592, 18) data X Test set contains (254, 18) data Y Train set contains (592,) data Y Test set contains (254,) data
# Baseline RBF-kernel SVM on all 18 standardized features.
svm_model = svm.SVC(gamma=0.025, C=3)
svm_model.fit(x_train, y_train)
svm_y_pred = svm_model.predict(x_test)
print(f'Accuracy score for SVM Model is {accuracy_score(y_test, svm_y_pred)}')
Accuracy score for SVM Model is 0.9566929133858267
# Confusion matrix of the baseline SVM as a labelled heatmap.
# Fix: the [i for i in [...]] wrappers were no-op comprehensions; plain list
# literals are equivalent and clearer.
cm = confusion_matrix(y_test, svm_y_pred)
cm_df = pd.DataFrame(cm, index=['1', '2', '3'],
                     columns=['Predicted_1', 'Predicted_2', 'Predicted_3'])
plt.figure(figsize=(7, 5))
sns.heatmap(cm_df, annot=True, fmt='g')
# Covariance of the standardized features (rowvar=False: columns are variables).
covMatrix = np.cov(x,rowvar=False)
print(covMatrix)
[[ 1.00118343 0.68569786 0.79086299 0.69055952 0.09164265 0.14842463 0.81358214 -0.78968322 0.81465658 0.67694334 0.76297234 0.81497566 0.58593517 -0.24988794 0.23635777 0.15720044 0.29889034 0.36598446] [ 0.68569786 1.00118343 0.79325751 0.6216467 0.15396023 0.25176438 0.8489411 -0.82244387 0.84439802 0.96245572 0.79724837 0.83693508 0.92691166 0.05200785 0.14436828 -0.01145212 -0.10455005 0.04640562] [ 0.79086299 0.79325751 1.00118343 0.76794246 0.15864319 0.26499957 0.90614687 -0.9123854 0.89408198 0.77544391 0.86253904 0.88706577 0.70660663 -0.22621115 0.1140589 0.26586088 0.14627113 0.33312625] [ 0.69055952 0.6216467 0.76794246 1.00118343 0.66423242 0.45058426 0.73529816 -0.79041561 0.70922371 0.56962256 0.79435372 0.71928618 0.53700678 -0.18061084 0.04877032 0.17394649 0.38266622 0.47186659] [ 0.09164265 0.15396023 0.15864319 0.66423242 1.00118343 0.64949139 0.10385472 -0.18325156 0.07969786 0.1270594 0.27323306 0.08929427 0.12211524 0.15313091 -0.05843967 -0.0320139 0.24016968 0.26804208] [ 0.14842463 0.25176438 0.26499957 0.45058426 0.64949139 1.00118343 0.16638787 -0.18035326 0.16169312 0.30630475 0.31933428 0.1434227 0.18996732 0.29608463 0.01561769 0.04347324 -0.02611148 0.14408905] [ 0.81358214 0.8489411 0.90614687 0.73529816 0.10385472 0.16638787 1.00118343 -0.97275069 0.99092181 0.81004084 0.94978498 0.9941867 0.80082111 -0.02757446 0.07454578 0.21267959 0.00563439 0.1189581 ] [-0.78968322 -0.82244387 -0.9123854 -0.79041561 -0.18325156 -0.18035326 -0.97275069 1.00118343 -0.95011894 -0.77677186 -0.93748998 -0.95494487 -0.76722075 0.10342428 -0.05266193 -0.18527244 -0.11526213 -0.2171615 ] [ 0.81465658 0.84439802 0.89408198 0.70922371 0.07969786 0.16169312 0.99092181 -0.95011894 1.00118343 0.81189327 0.93533261 0.98938264 0.79763248 -0.01551372 0.08386628 0.21495454 -0.01867064 0.09940372] [ 0.67694334 0.96245572 0.77544391 0.56962256 0.1270594 0.30630475 0.81004084 -0.77677186 0.81189327 1.00118343 0.74586628 0.79555492 0.86747579 0.04167099 0.13601231 
0.00136727 -0.10407076 0.07686047] [ 0.76297234 0.79724837 0.86253904 0.79435372 0.27323306 0.31933428 0.94978498 -0.93748998 0.93533261 0.74586628 1.00118343 0.94679667 0.77983844 0.11321163 0.03677248 0.19446837 0.01423606 0.08579656] [ 0.81497566 0.83693508 0.88706577 0.71928618 0.08929427 0.1434227 0.9941867 -0.95494487 0.98938264 0.79555492 0.94679667 1.00118343 0.79595778 -0.01541878 0.07696823 0.20104818 0.00622636 0.10305714] [ 0.58593517 0.92691166 0.70660663 0.53700678 0.12211524 0.18996732 0.80082111 -0.76722075 0.79763248 0.86747579 0.77983844 0.79595778 1.00118343 0.19169941 0.16667971 -0.05621953 -0.22471583 -0.11814142] [-0.24988794 0.05200785 -0.22621115 -0.18061084 0.15313091 0.29608463 -0.02757446 0.10342428 -0.01551372 0.04167099 0.11321163 -0.01541878 0.19169941 1.00118343 -0.08846001 -0.12633227 -0.749751 -0.80307227] [ 0.23635777 0.14436828 0.1140589 0.04877032 -0.05843967 0.01561769 0.07454578 -0.05266193 0.08386628 0.13601231 0.03677248 0.07696823 0.16667971 -0.08846001 1.00118343 -0.03503155 0.1154338 0.09724079] [ 0.15720044 -0.01145212 0.26586088 0.17394649 -0.0320139 0.04347324 0.21267959 -0.18527244 0.21495454 0.00136727 0.19446837 0.20104818 -0.05621953 -0.12633227 -0.03503155 1.00118343 0.07740174 0.20523257] [ 0.29889034 -0.10455005 0.14627113 0.38266622 0.24016968 -0.02611148 0.00563439 -0.11526213 -0.01867064 -0.10407076 0.01423606 0.00622636 -0.22471583 -0.749751 0.1154338 0.07740174 1.00118343 0.89363767] [ 0.36598446 0.04640562 0.33312625 0.47186659 0.26804208 0.14408905 0.1189581 -0.2171615 0.09940372 0.07686047 0.08579656 0.10305714 -0.11814142 -0.80307227 0.09724079 0.20523257 0.89363767 1.00118343]]
# Fit PCA keeping all 18 components to inspect the full variance spectrum.
pca = PCA(n_components=18)
pca.fit(x)
PCA(n_components=18)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA(n_components=18)
# Scree plot: variance explained by each component, then the cumulative curve
# with a 90% reference line to choose how many components to keep.
plt.bar(list(range(1,19)),pca.explained_variance_ratio_,alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
plt.step(list(range(1,19)),np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.axhline(y=0.9,color='red')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Keep the first 10 components (~97% cumulative variance per the ratios below)
# and project the standardized features onto them.
pca3 = PCA(n_components=10)
pca3.fit(x)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
xpca = pca3.transform(x)
[[ 2.75283688e-01 2.93258469e-01 3.04609128e-01 2.67606877e-01 8.05039890e-02 9.72756855e-02 3.17092750e-01 -3.14133155e-01 3.13959064e-01 2.82830900e-01 3.09280359e-01 3.13788457e-01 2.72047492e-01 -2.08137692e-02 4.14555082e-02 5.82250207e-02 3.02795063e-02 7.41453913e-02] [-1.26953763e-01 1.25576727e-01 -7.29516436e-02 -1.89634378e-01 -1.22174860e-01 1.07482875e-02 4.81181371e-02 1.27498515e-02 5.99352482e-02 1.16220532e-01 6.22806229e-02 5.37843596e-02 2.09233172e-01 4.88525148e-01 -5.50899716e-02 -1.24085090e-01 -5.40914775e-01 -5.40354258e-01] [-1.19922479e-01 -2.48205467e-02 -5.60143254e-02 2.75074211e-01 6.42012966e-01 5.91801304e-01 -9.76283108e-02 5.76484384e-02 -1.09512416e-01 -1.70641987e-02 5.63239801e-02 -1.08840729e-01 -3.14636493e-02 2.86277015e-01 -1.15679354e-01 -7.52828901e-02 8.73592034e-03 3.95242743e-02] [ 7.83843562e-02 1.87337408e-01 -7.12008427e-02 -4.26053415e-02 3.27257119e-02 3.14147277e-02 -9.57485748e-02 8.22901952e-02 -9.24582989e-02 1.88005612e-01 -1.19844008e-01 -9.17449325e-02 2.00095228e-01 -6.55051354e-02 6.04794251e-01 -6.66114117e-01 1.05526253e-01 4.74890311e-02] [ 6.95178336e-02 -8.50649539e-02 4.06645651e-02 -4.61473714e-02 -4.05494487e-02 2.13432566e-01 -1.54853055e-02 7.68518712e-02 2.17633157e-03 -6.06366845e-02 -4.56472367e-04 -1.95548315e-02 -6.15991681e-02 1.45530146e-01 7.29189842e-01 5.99196401e-01 -1.00602332e-01 -2.98614819e-02] [ 1.44875476e-01 -3.02731148e-01 -1.38405773e-01 2.48136636e-01 2.36932611e-01 -4.19330747e-01 1.16100153e-01 -1.41840112e-01 9.80561329e-02 -4.61674972e-01 2.36225434e-01 1.57820194e-01 -1.35576278e-01 2.41356821e-01 2.03209257e-01 -1.91960802e-01 1.56939174e-01 -2.41222817e-01] [ 4.51862331e-01 -2.49103387e-01 7.40350569e-02 -1.76912814e-01 -3.97876601e-01 5.03413610e-01 6.49879382e-02 1.38112945e-02 9.66573058e-02 -1.04552173e-01 1.14622578e-01 8.37350220e-02 -3.73944382e-01 1.11952983e-01 -8.06328902e-02 -2.84558723e-01 1.81451818e-02 1.57237839e-02] [-5.66136785e-01 -1.79851809e-01 
4.34748988e-01 1.01998360e-01 -6.87147927e-02 1.61153097e-01 1.00688056e-01 -2.15497166e-01 6.35933915e-02 -2.49495867e-01 5.02096319e-02 4.37649907e-02 -1.08474496e-01 -3.40878491e-01 1.56487670e-01 -2.08774083e-01 -3.04580219e-01 -3.04186304e-02] [-4.84418105e-01 -1.41569001e-02 -1.67572478e-01 -2.30313563e-01 -2.77128307e-01 1.48032250e-01 5.44574214e-02 -1.56867362e-01 5.24978759e-03 -6.10362445e-02 2.97588112e-01 8.33669838e-02 2.41655483e-01 3.20221887e-01 2.21054148e-02 1.01761758e-02 5.17222779e-01 1.71506343e-01] [-2.60076393e-01 9.80779086e-02 -2.05031597e-01 -4.77888949e-02 1.08075009e-01 -1.18266345e-01 1.65167200e-01 -1.51612333e-01 1.93777917e-01 4.69059999e-01 -1.29986011e-01 1.58203940e-01 -6.86493700e-01 1.27648385e-01 9.83643219e-02 -3.55150608e-02 1.93956186e-02 6.41314778e-02]] [0.52186034 0.16729768 0.10562639 0.0654746 0.05089869 0.02996413 0.01991366 0.01231501 0.00891215 0.00509715]
sns.pairplot(pd.DataFrame(xpca),diag_kind='kde');  # components are mutually uncorrelated
After applying PCA, the resulting components are uncorrelated with one another, as expected.
# Re-split using the 10 principal components (same seed, so the same 70:30 rows).
x_train, x_test, y_train, y_test = train_test_split(xpca, y, test_size=0.30, random_state=1)
for part_name, part in (('X Train', x_train), ('X Test', x_test),
                        ('Y Train', y_train), ('Y Test', y_test)):
    print('{} set contains {} data'.format(part_name, part.shape))
X Train set contains (592, 10) data X Test set contains (254, 10) data Y Train set contains (592,) data Y Test set contains (254,) data
# Same SVM hyperparameters, now trained on the 10 PCA components.
svm_model = svm.SVC(gamma=0.025, C=3)
svm_model.fit(x_train, y_train)
svm_y_pred = svm_model.predict(x_test)
print(f'Accuracy score for SVM Model is {accuracy_score(y_test, svm_y_pred)}')
Accuracy score for SVM Model is 0.952755905511811
print(classification_report(y_test, svm_y_pred))  # per-class precision/recall/F1
precision recall f1-score support
1 0.93 0.90 0.92 62
2 0.98 0.97 0.97 133
3 0.92 0.97 0.94 59
accuracy 0.95 254
macro avg 0.94 0.95 0.94 254
weighted avg 0.95 0.95 0.95 254
# Confusion matrix of the PCA-based SVM.
# Fix: no-op [i for i in [...]] comprehensions replaced by list literals,
# matching the earlier confusion-matrix cell.
cm = confusion_matrix(y_test, svm_y_pred)
cm_df = pd.DataFrame(cm, index=['1', '2', '3'],
                     columns=['Predicted_1', 'Predicted_2', 'Predicted_3'])
plt.figure(figsize=(7, 5))
sns.heatmap(cm_df, annot=True, fmt='g')
from sklearn.model_selection import GridSearchCV
# Exhaustive search over C and gamma for an RBF SVM (default 5-fold CV);
# refit=True retrains the best combination on the full training set.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['rbf']}
grid = GridSearchCV(svm.SVC(),param_grid,refit= True, verbose =0)
grid.fit(x_train,y_train)
GridSearchCV(estimator=SVC(),
param_grid={'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['rbf']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=SVC(),
param_grid={'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['rbf']})SVC()
SVC()
# Best hyperparameters and the refitted estimator from the grid search.
print(grid.best_params_)
print(grid.best_estimator_)
{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=100, gamma=0.1)
# Held-out accuracy of the tuned model (predict uses the refitted best estimator).
grid_y_pred = grid.predict(x_test)
print('Accuracy score for SVM Model is %1.2f'%accuracy_score(y_test,grid_y_pred))
Accuracy score for SVM Model is 0.93
A few of the assumptions in PCA are:
Advantages of using PCA:
Limitations of using PCA: